Summary

This code follows publications_LENS_preprocessing.Rmd which processes the list of the 1019 included articles stored in 20201215_EGM_Net_all-articles_clean.csv and runs a query via LENS API custom function to retrieve their detailed bibliographic information. Retrieved data is processed to remove duplicated records, resulting in 974 records with unique titles. The data is stored as a hierarchical data object with multiple levels of nested data tables (record_df in LENS_dataframe.RData).

This code produces collaboration plots and statistics using igraph package.

Load data

#load(file = here("data", "LENS_dataframe_cleaned.RData")) #loads semi-cleaned LENS output from a Rdata object - "record_df"

dat <- read.csv(here("data", "20201215_EGM_Net_all-articles_clean.csv"))
dim(dat) #1019   16
## [1] 1019   16
#names(dat)
#hist(dat$year)

# #initial checks
# length(unique(dat$Title)) #1019
# title <- unique(dat$Title)
# length(unique(dat$DOI)) #only 184 DOI values and missing the value is stored as ""! (change to NA later?)
# table(dat$Item_Type) #mostly articles
# hist(dat$Pub_Year, breaks = 70)
# table(dat$Manual_Tags) #stored as vectors of characters, tags separated by "; "
# #table(dat$Item_Type)["journalArticle"] # 959 journal articles

Use igraph to create collaboration networks for authors

#load preprocessed LENS data frame with cleaned authors names and ids
record_df_data.authors.ids <- read.csv(here("data", "record_df_data.authors.ids_cleaned.csv")) #loads cleaned authors dataframe from csv


#prepare data frame for igraph
dt <- data.frame(pub.id = record_df_data.authors.ids$data.lens_id, Author = record_df_data.authors.ids$Author, value = record_df_data.authors.ids$value)
#str(dt)

dt %>%
  inner_join(dt, by = "pub.id") %>%
  filter(value.x < value.y) %>%
  count(Author.x, Author.y) %>%
  graph_from_data_frame(directed = FALSE) -> g1 #with author names as vertices

# dt %>%
#   inner_join(dt, by = "pub.id") %>%
#   filter(Author.x < Author.y) %>%
#   count(value.x, value.y) %>%
#   graph_from_data_frame(directed = FALSE) -> g1 #with ids as vertices

#see components of the igraph object
# E(g1)
# V(g1)
# g1[]
# edge_attr(g1)
# vertex_attr(g1)
# as_data_frame(g1, what = "edges")

#plot(g1) #basic plot - not readable
plot(g1, edge.arrow.size=0, vertex.color="gold", vertex.size=5, 
     vertex.frame.color="gray", vertex.label.color="black", 
     vertex.label.cex=0.8, vertex.label.dist=2, edge.curved=0.2, vertex.label=NA)

g1s <- simplify( g1, remove.multiple = T, remove.loops = T, 
                 edge.attr.comb=list(weight="sum", "ignore") ) #simplify by removing loops and merging overlapping edges
plot(g1s, edge.arrow.size=0, vertex.color="gold", vertex.size=5, vertex.frame.color="gray", 
     vertex.label.color="black", vertex.label.cex=0.8, vertex.label.dist=2, edge.curved=0.2, vertex.label=NA)

E(g1s)$weight <- 1 #add weights of 1 to each connection
E(g1s)$width <- E(g1s)$weight*2 #add weights to edges

#grep("^layout_", ls("package:igraph"), value=TRUE)[-1]  #list of available layouts 
plot(g1s, layout=layout_nicely, edge.arrow.size=0.0, vertex.color="gold", vertex.size=2,
     vertex.frame.color="gray", vertex.label.color="black", vertex.label=NA) # plotting with specific layout and without node labels

##Find cliques (complete subgraphs of an undirected graph)
# g1s_cliques <- cliques(as.undirected(g1s), min=10)# list of cliques       
# hist(sapply(g1s_cliques, length)) # histogram of clique sizes
# g1s_cliques_largest <- largest_cliques(as.undirected(g1s)) # cliques with max number of nodes
# vcol <- rep("grey80", vcount(as.undirected(g1s)))
# vcol[unlist(g1s_cliques_largest)] <- "gold"
# plot(as.undirected(g1s), layout=layout_nicely, vertex.color=vcol, vertex.size=2, vertex.label=NA)

##Community detection based on edge betweenness (Newman-Girvan)
##High-betweenness edges are removed sequentially (recalculating at each step) and the best partitioning of the network is selected.
ceb <- cluster_edge_betweenness(as.undirected(g1s)) 
length(ceb) #279 communities
## [1] 279
#membership(ceb) #authors and community number
modularity(ceb) #high modularity 0.967
## [1] 0.9668105
plot(ceb, as.undirected(g1s), layout=layout_nicely, vertex.size=2, vertex.label=NA)

#Community detection based on based on propagating labels
clp <- cluster_label_prop(g1s)
plot(clp, g1s, layout=layout_nicely, edge.arrow.size=0.0, vertex.size=2, vertex.label=NA)

#Community detection based on greedy optimization of modularity
cfg <- cluster_fast_greedy(as.undirected(g1s))
plot(cfg, as.undirected(g1s), layout=layout_nicely, edge.arrow.size=0.0, vertex.size=2, vertex.label=NA)

length(cfg) #280 communities
## [1] 280
modularity(cfg) #high modularity
## [1] 0.9650914
#order(sizes(cfg), decreasing=TRUE)
order(sizes(cfg), decreasing=TRUE)[1] # largest community nr1
## [1] 1
members_1 <- membership(cfg)[membership(cfg)==1] #members of the community nr1
names(members_1)
##  [1] "Abdul-Sada, A"         "Allen, D"              "Amy, SR"              
##  [4] "Badenhausser, I"       "Baldock, KCR"          "Blake, RJ"            
##  [7] "Botham, MS"            "Botías, C"             "Bourke, AFG"          
## [10] "Broughton, RK"         "Brown, VK"             "Bruneteau, L"         
## [13] "Carvell, C"            "Chauvel, B"            "Craze, PG"            
## [16] "Critchley, CNR"        "Croxton, PJ"           "David, A"             
## [19] "Dover, JW"             "Eggleton, P"           "Feltham, H"           
## [22] "Firbank, LG"           "Fowbert, JA"           "Freeman, SN"          
## [25] "Fuentes-Montemayor, E" "George, C"             "Goulson, D"           
## [28] "Gross, N"              "Gundrey, AL"           "Hann, JP"             
## [31] "Harris, SJ"            "Harrison-Cripps, J"    "Heard, MS"            
## [34] "Herbert, I"            "Hill, EM"              "Holland, JM"          
## [37] "Horwood, J"            "Hulmes, L"             "Hulmes, S"            
## [40] "James, KL"             "Lambert, M"            "Large, R"             
## [43] "Loxton, D"             "Lye, GC"               "Meek, B"              
## [46] "Meek, WR"              "Memmott, J"            "Mountford, JO"        
## [49] "Noordhuis, R"          "Nowakowski, M"         "Nuttall, P"           
## [52] "Osborne, JL"           "Parish, T"             "Park, KJ"             
## [55] "Parkinson, AE"         "Petit, S"              "Peyton, J"            
## [58] "Pilgrim, ES"           "Potts, SG"             "Pywell, RF"           
## [61] "Ramsay, AJ"            "Ramsey, AJ"            "Reboud, X"            
## [64] "Redhead, JW"           "Rose, R"               "Sherwood, A"          
## [67] "Smith, REN"            "Sparks, TH"            "Staley, JT"           
## [70] "Tallowin, JRB"         "Tscheulin, T"          "Turk, A"              
## [73] "Vanbergen, AJ"         "Vandier, M"            "Walker, KJ"           
## [76] "Warman, EA"            "Westbury, DB"          "Woodcock, BA"         
## [79] "Nicholls, E"           "Mole, AC"              "Hartley, SE"          
## [82] "Shore, RF"             "Cordeau, S"            "Sutton, P"            
## [85] "Tallowin, JR"          "Facey, SL"             "Bell, D"              
## [88] "Greatorex-Davies, JN"  "Loxton, RG"            "Westrich, P"          
## [91] "Sherwood, AJ"          "Smith, J"              "Minderman, J"         
## [94] "Thomas, SR"            "Pickett, H"            "Hinsley, SA"          
## [97] "Garbutt, RA"
members_2 <- membership(cfg)[membership(cfg)==2] #members of the community nr2
names(members_2)
##  [1] "Alignier, A"           "Batáry, P"             "Baudry, J"            
##  [4] "Besson, MM"            "Bommarco, R"           "Burel, F"             
##  [7] "Butet, A"              "C.F.G., T"             "Codreanu, P"          
## [10] "Dänhardt, J"           "de la Peña, NM"        "Delattre, P"          
## [13] "Delattre, T"           "Delettre, YR"          "Denys, C"             
## [16] "Derocles, SAP"         "Dormann, CF"           "Ekroos, J"            
## [19] "Ernoult, A"            "Evans, DM"             "Fischer, C"           
## [22] "Földesi, R"            "Fründ, J"              "Garcia, AF"           
## [25] "García, AF"            "Georgelin, E"          "Gerowitt, B"          
## [28] "Griffiths, GJK"        "Haenke, S"             "Hassan, DA"           
## [31] "Holzschuh, A"          "Hu, H"                 "Jambon, O"            
## [34] "Jauker, B"             "Jönsson, A"            "Kindlmann, P"         
## [37] "Kleijn, D"             "Krewenka, KM"          "Larroudé, P"          
## [40] "Le Coeur, D"           "Le Cœur, D"            "Le Ralec, A"          
## [43] "Ludwig, M"             "Maret, M"              "Marshall, EJP"        
## [46] "Matthiesen, T"         "Michel, N"             "Miot, S"              
## [49] "Moonen, AC"            "Morellet, N"           "Öberg, S"             
## [52] "Olsson, O"             "Paillat, G"            "Paoletti, M"          
## [55] "Parkinson, L"          "Plantegenest, M"       "Poggi, S"             
## [58] "Prentice, HC"          "Riedinger, V"          "Roberts, SPM"         
## [61] "Saussure, S"           "Schaefer, M"           "Scheid, BE"           
## [64] "Scheper, J"            "Schlinkert, H"         "Sennedot, F"          
## [67] "Smith, HG"             "Steffan-Dewenter, I"   "Street, TI"           
## [70] "Thies, C"              "Thomas, CFG"           "Tiainen, J"           
## [73] "Trnka, A"              "Tscharntke, T"         "Vernon, P"            
## [76] "Vialatte, A"           "Walton, A"             "Wickens, JB"          
## [79] "Wickens, VJ"           "Kovács-Hostyánszki, A" "Scherber, C"          
## [82] "Fargue-Lelièvre, A"    "Joenje, W"             "Quéré, JP"            
## [85] "Thenail, C"            "Ekbom, B"              "Rundlöf, NA"          
## [88] "Morant, P"             "Rantier, Y"            "Andersson, GKS"       
## [91] "Piha, M"               "Verbeek, M"            "West, TM"             
## [94] "Thibord, J"            "Hall, K"               "Bäckman, J"
#remove all nodes not from community1,  by name
g1s1 <- delete_vertices(g1s, names(membership(cfg)[membership(cfg)!=1]))
plot(g1s1, layout=layout_nicely, edge.arrow.size=0, vertex.color="gold", vertex.size=5, vertex.frame.color="gray", 
     vertex.label.color="black", vertex.label.cex=0.8, vertex.label.dist=2, edge.curved=0.2) #plot of largest community (nr1)

#gather information related to authors in community1:
members_1_data <- subset(record_df_data.authors.ids, record_df_data.authors.ids$Author %in% names(members_1))
dim(members_1_data) #more rows than authors, as it also has information on every associated publication (267)
## [1] 267  11
length(unique(members_1_data$Author)) #97 authors
## [1] 97
length(unique(members_1_data$data.title)) #59 titles
## [1] 59
#Note: in order to get affliation data it needs to be merged with info from record_df_data.authors.aff (loaded in the next chunk, author details not cleaned)

#plot_dendrogram(ceb, mode="hclust") #dendogram plot - too dense!

#more at: https://kateto.net/netscix2016.html

Use igraph to create collaboration networks for countries

Countries codes are mostly cleaned now. We use two-letter country codes (fips) from https://www.geonames.org/countries/.

record_df_data.authors.aff <- read.csv(here("data", "record_df_data.authors.aff_cleaned.csv"))
#names(record_df_data.authors.aff)

record_df_data.authors.aff$Author <- paste(record_df_data.authors.aff$last_name, record_df_data.authors.aff$initials, sep=", ") #this would need some extra cleaning to perfectly match Author field in record_df_data.authors.ids

#prepare data frame for igraph
dtc <- data.frame(pub.id = record_df_data.authors.aff$data.lens_id, Author = record_df_data.authors.aff$Author, value = record_df_data.authors.aff$country_code)
#str(dtc)

dtc %>%
  inner_join(dtc, by = "pub.id") %>%
  filter(Author.x < Author.y) %>%
  count(value.x, value.y) %>%
  graph_from_data_frame(directed = FALSE) -> g1c

#as_data_frame(g1c, what = "edges")

# plot(g1c)
# plot(g1c, edge.arrow.size=0, vertex.color="gold", vertex.size=5, 
#      vertex.frame.color="gray", vertex.label.color="black", 
#      vertex.label.cex=0.8, vertex.label.dist=2, edge.curved=0.2)

#E(g1c)$weight <- 1 #add weights

g1cs <- simplify(g1c, remove.multiple = T, remove.loops = T, 
                 edge.attr.comb=list(weight="sum", "ignore") ) #simplify
plot(g1cs, layout=layout_nicely, edge.arrow.size=0, vertex.color="gold", vertex.size=10, vertex.frame.color="gray", 
     vertex.label.color="black", vertex.label.cex=0.8, vertex.label.dist=0, edge.curved=0.2)

#E(g1cs)
#V(g1cs)

##Find cliques (complete subgraphs of an undirected graph)
# cliques(as.undirected(g1cs)) # list of cliques       
# sapply(cliques(as.undirected(g1cs)), length) # clique sizes
# largest_cliques(as.undirected(g1cs)) # clique with max number of nodes
# vcol <- rep("grey80", vcount(as.undirected(g1cs)))
# vcol[unlist(largest_cliques(as.undirected(g1cs)))] <- "gold"
# plot(as.undirected(g1cs), vertex.label=V(g1cs)$name, vertex.color=vcol)

##Community detection based on edge betweenness (Newman-Girvan)
##High-betweenness edges are removed sequentially (recalculating at each step) and the best partitioning of the network is selected.
cebc <- cluster_edge_betweenness(as.undirected(g1cs)) 
#dendPlot(cebi, mode="hclust") #too dense
plot(cebc, as.undirected(g1cs), layout=layout_nicely, edge.arrow.size=0.0, vertex.color="gold", vertex.size=2,
     vertex.frame.color="grey", vertex.label.color="black") #, vertex.label=NA

#more at: https://kateto.net/netscix2016.html, pretty_plots.R

Use igraph to create collaboration networks for institutions

Note: needs to be redone after cleaning and imputing instituton data. https://www.grid.ac/ Global Research Identifier Database (GRID) with institution names (affiliation name).

record_df_data.authors.aff <- read.csv(here("data", "record_df_data.authors.aff_cleaned.csv"))
#dim(record_df_data.authors.aff) #2968

## make a subset without missing grid_id data:
record_df_data.inst <- record_df_data.authors.aff[!is.na(record_df_data.authors.aff$grid_id), ] 
#dim(record_df_data.inst) #2188 records

#using country code and institution grid.id combined as a unique identifier
record_df_data.inst$country_grid.id <- paste(record_df_data.inst$country_code, record_df_data.inst$grid_id, sep=", ")
record_df_data.inst$Author <- paste(record_df_data.inst$last_name, record_df_data.inst$initials, sep=", ")

#prepare data frame for igraph
dti <- data.frame(pub.id = record_df_data.inst$data.lens_id, Author = record_df_data.inst$Author, value = record_df_data.inst$country_grid.id)
#str(dti)

dti %>%
  inner_join(dti, by = "pub.id") %>%
  filter(Author.x < Author.y) %>%
  count(value.x, value.y) %>%
  graph_from_data_frame(directed = FALSE) -> g1i

#as_data_frame(g1i, what = "edges")

# plot(g1i)
# plot(g1i, edge.arrow.size=0, vertex.color="gold", vertex.size=5, 
#      vertex.frame.color="gray", vertex.label.color="black", 
#      vertex.label.cex=0.8, vertex.label.dist=2, edge.curved=0.2)

#E(g1i)$weight <- 1 #add weights
g1is <- simplify(g1i, remove.multiple = T, remove.loops = T, 
                 edge.attr.comb=list(weight="sum", "ignore") ) #simplify
plot(g1is, layout=layout_nicely, edge.arrow.size=0, vertex.color="gold", vertex.size=5, vertex.frame.color="gray", 
     vertex.label.color="black", vertex.label.cex=0.8, vertex.label.dist=2, edge.curved=0.2)

#E(g1is)
#V(g1is)

##Find cliques (complete subgraphs of an undirected graph)
# cliques(as.undirected(g1is)) # list of cliques       
# sapply(cliques(as.undirected(g1is)), length) # clique sizes
largest_cliques(as.undirected(g1is)) # clique with max number of nodes
## [[1]]
## + 5/299 vertices, named, from 190d589:
## [1] ES, grid.5319.e CH, grid.5333.6 GB, grid.8250.f NL, grid.5477.1
## [5] RO, grid.5100.4
## 
## [[2]]
## + 5/299 vertices, named, from 190d589:
## [1] US, grid.463419.d US, grid.34421.30 US, grid.472551.0 ES, grid.4711.3  
## [5] US, grid.167436.1
## 
## [[3]]
## + 5/299 vertices, named, from 190d589:
## [1] UK, grid.423196.b GB, grid.421630.2 GB, grid.1006.7   GB, grid.11835.3e
## [5] GB, grid.4991.5  
## 
## [[4]]
## + 5/299 vertices, named, from 190d589:
## [1] SK, grid.412903.d DE, grid.7450.6   DE, grid.8379.5   DE, grid.9122.8  
## [5] HU, grid.5591.8  
## 
## [[5]]
## + 5/299 vertices, named, from 190d589:
## [1] SE, grid.6341.0 GB, grid.9435.b DE, grid.8379.5 NL, grid.4818.5
## [5] SE, grid.4514.4
## 
## [[6]]
## + 5/299 vertices, named, from 190d589:
## [1] RO, grid.5100.4   GB, grid.8250.f   NL, grid.5477.1   ES, grid.5841.8  
## [5] PL, grid.413454.3
## 
## [[7]]
## + 5/299 vertices, named, from 190d589:
## [1] GB, grid.8391.3   GB, grid.418374.d GB, grid.43641.34 GB, grid.9835.7  
## [5] GB, grid.5491.9  
## 
## [[8]]
## + 5/299 vertices, named, from 190d589:
## [1] GB, grid.8391.3   GB, grid.418374.d GB, grid.43641.34 GB, grid.9835.7  
## [5] GB, grid.421944.e
# vcol <- rep("grey80", vcount(as.undirected(g1is)))
# vcol[unlist(largest_cliques(as.undirected(g1is)))] <- "gold"
# plot(as.undirected(g1is), vertex.label=V(g1is)$name, vertex.color=vcol)

##Community detection based on edge betweenness (Newman-Girvan)
##High-betweenness edges are removed sequentially (recalculating at each step) and the best partitioning of the network is selected.
cebi <- cluster_edge_betweenness(as.undirected(g1is)) 
#dendPlot(cebi, mode="hclust") #too dense
plot(cebi, as.undirected(g1is), layout=layout_nicely, edge.arrow.size=0.0, vertex.color="gold", vertex.size=2,
     vertex.frame.color="grey", vertex.label.color="black", vertex.label=NA)

#more at: https://kateto.net/netscix2016.html, pretty_plots.R